Jobs Preddiction by using Machine Learning¶

Import Libraries¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro 

from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import plot_tree, DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

from sklearn.svm import SVC

Data Gathering¶

In [2]:
df=pd.read_csv(r"D:\Excel project\energy_dataset_.csv")
df
Out[2]:
Type_of_Renewable_Energy Installed_Capacity_MW Energy_Production_MWh Energy_Consumption_MWh Energy_Storage_Capacity_MWh Storage_Efficiency_Percentage Grid_Integration_Level Initial_Investment_USD Funding_Sources Financial_Incentives_USD GHG_Emission_Reduction_tCO2e Air_Pollution_Reduction_Index Jobs_Created
0 4 93.423205 103853.2206 248708.4892 2953.248771 89.887562 4 4.732248e+08 1 9.207772e+06 6663.816572 81.742461 1366
1 4 590.468942 190223.0649 166104.1642 5305.174042 84.403343 4 1.670697e+08 2 1.685101e+06 30656.049820 78.139042 1743
2 1 625.951142 266023.4824 424114.6308 2620.192622 60.498249 2 8.463610e+07 2 5.111813e+06 1749.613759 8.461296 363
3 1 779.998728 487039.5296 308337.7316 1925.250307 86.897861 3 3.967690e+08 2 4.805902e+06 43233.237820 8.402441 2821
4 3 242.106837 482815.0856 360437.7705 3948.945383 70.949351 2 3.574413e+07 1 1.668601e+07 14858.662760 28.822867 2583
... ... ... ... ... ... ... ... ... ... ... ... ... ...
14995 3 745.032555 280007.5738 230544.8268 4351.687893 90.791405 4 3.484136e+08 2 1.558508e+07 25234.911810 78.923200 1452
14996 1 15.187023 377340.5803 358547.3589 6792.194696 78.252040 4 2.560179e+08 3 6.866618e+06 15762.519790 54.982974 2598
14997 3 877.539059 480497.3920 214441.6719 4588.725297 58.282928 1 1.300112e+08 2 3.837764e+06 44597.809410 43.915897 2713
14998 7 551.264716 436383.1694 137043.8713 7251.144215 73.573666 2 3.334831e+08 2 5.347706e+06 34363.858000 4.877145 2128
14999 3 863.421803 314014.3005 124461.9178 7366.166362 86.868893 4 2.989825e+08 2 1.832534e+07 27193.217600 50.559075 1871

15000 rows × 13 columns

EDA - Exploratory Data Analysis¶

In [3]:
df.size
Out[3]:
195000
In [4]:
df.shape
Out[4]:
(15000, 13)
In [5]:
df.axes
Out[5]:
[RangeIndex(start=0, stop=15000, step=1),
 Index(['Type_of_Renewable_Energy', 'Installed_Capacity_MW',
        'Energy_Production_MWh', 'Energy_Consumption_MWh',
        'Energy_Storage_Capacity_MWh', 'Storage_Efficiency_Percentage',
        'Grid_Integration_Level', 'Initial_Investment_USD', 'Funding_Sources',
        'Financial_Incentives_USD', 'GHG_Emission_Reduction_tCO2e',
        'Air_Pollution_Reduction_Index', 'Jobs_Created'],
       dtype='object')]
In [6]:
df.columns
Out[6]:
Index(['Type_of_Renewable_Energy', 'Installed_Capacity_MW',
       'Energy_Production_MWh', 'Energy_Consumption_MWh',
       'Energy_Storage_Capacity_MWh', 'Storage_Efficiency_Percentage',
       'Grid_Integration_Level', 'Initial_Investment_USD', 'Funding_Sources',
       'Financial_Incentives_USD', 'GHG_Emission_Reduction_tCO2e',
       'Air_Pollution_Reduction_Index', 'Jobs_Created'],
      dtype='object')
In [7]:
df.describe()
Out[7]:
Type_of_Renewable_Energy Installed_Capacity_MW Energy_Production_MWh Energy_Consumption_MWh Energy_Storage_Capacity_MWh Storage_Efficiency_Percentage Grid_Integration_Level Initial_Investment_USD Funding_Sources Financial_Incentives_USD GHG_Emission_Reduction_tCO2e Air_Pollution_Reduction_Index Jobs_Created
count 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 15000.000000 1.500000e+04 15000.000000 1.500000e+04 15000.000000 15000.000000 15000.000000
mean 3.973933 495.855747 252350.955621 225981.854966 5030.196472 75.219334 2.501267 2.514849e+08 2.004000 1.002977e+07 25234.722158 50.724179 2502.668600
std 1.999380 288.212872 144062.915425 129223.677997 2894.096326 14.485289 1.123306 1.432843e+08 0.817493 5.787303e+06 14378.915277 28.556578 1451.212661
min 1.000000 1.091767 1030.103692 584.048006 2.200208 50.003494 1.000000 1.008107e+06 1.000000 5.155842e+04 100.974460 1.009712 10.000000
25% 2.000000 245.475737 128568.875900 115587.428225 2543.341380 62.694076 1.000000 1.259709e+08 1.000000 4.963454e+06 12754.476927 26.224426 1228.000000
50% 4.000000 492.907555 253216.871250 225226.374350 5054.036248 75.279702 3.000000 2.539910e+08 2.000000 1.002414e+07 25424.477000 50.250207 2496.000000
75% 6.000000 742.254682 377083.805575 338656.214700 7536.935405 87.754318 4.000000 3.759130e+08 3.000000 1.507417e+07 37750.426285 75.720133 3765.000000
max 7.000000 999.982979 499991.200400 449922.667800 9999.145037 99.994955 4.000000 4.999407e+08 3.000000 1.999855e+07 49997.578530 99.980494 4999.000000
In [8]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Type_of_Renewable_Energy       15000 non-null  int64  
 1   Installed_Capacity_MW          15000 non-null  float64
 2   Energy_Production_MWh          15000 non-null  float64
 3   Energy_Consumption_MWh         15000 non-null  float64
 4   Energy_Storage_Capacity_MWh    15000 non-null  float64
 5   Storage_Efficiency_Percentage  15000 non-null  float64
 6   Grid_Integration_Level         15000 non-null  int64  
 7   Initial_Investment_USD         15000 non-null  float64
 8   Funding_Sources                15000 non-null  int64  
 9   Financial_Incentives_USD       15000 non-null  float64
 10  GHG_Emission_Reduction_tCO2e   15000 non-null  float64
 11  Air_Pollution_Reduction_Index  15000 non-null  float64
 12  Jobs_Created                   15000 non-null  int64  
dtypes: float64(9), int64(4)
memory usage: 1.5 MB
In [9]:
df.isna().sum()
Out[9]:
Type_of_Renewable_Energy         0
Installed_Capacity_MW            0
Energy_Production_MWh            0
Energy_Consumption_MWh           0
Energy_Storage_Capacity_MWh      0
Storage_Efficiency_Percentage    0
Grid_Integration_Level           0
Initial_Investment_USD           0
Funding_Sources                  0
Financial_Incentives_USD         0
GHG_Emission_Reduction_tCO2e     0
Air_Pollution_Reduction_Index    0
Jobs_Created                     0
dtype: int64
In [10]:
df.dtypes
Out[10]:
Type_of_Renewable_Energy           int64
Installed_Capacity_MW            float64
Energy_Production_MWh            float64
Energy_Consumption_MWh           float64
Energy_Storage_Capacity_MWh      float64
Storage_Efficiency_Percentage    float64
Grid_Integration_Level             int64
Initial_Investment_USD           float64
Funding_Sources                    int64
Financial_Incentives_USD         float64
GHG_Emission_Reduction_tCO2e     float64
Air_Pollution_Reduction_Index    float64
Jobs_Created                       int64
dtype: object
In [11]:
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(),annot=True,cmap='rainbow')
Out[11]:
<Axes: >
In [12]:
df.var()
Out[12]:
Type_of_Renewable_Energy         3.997520e+00
Installed_Capacity_MW            8.306666e+04
Energy_Production_MWh            2.075412e+10
Energy_Consumption_MWh           1.669876e+10
Energy_Storage_Capacity_MWh      8.375794e+06
Storage_Efficiency_Percentage    2.098236e+02
Grid_Integration_Level           1.261816e+00
Initial_Investment_USD           2.053038e+16
Funding_Sources                  6.682952e-01
Financial_Incentives_USD         3.349287e+13
GHG_Emission_Reduction_tCO2e     2.067532e+08
Air_Pollution_Reduction_Index    8.154781e+02
Jobs_Created                     2.106018e+06
dtype: float64
In [13]:
df.cov()
Out[13]:
Type_of_Renewable_Energy Installed_Capacity_MW Energy_Production_MWh Energy_Consumption_MWh Energy_Storage_Capacity_MWh Storage_Efficiency_Percentage Grid_Integration_Level Initial_Investment_USD Funding_Sources Financial_Incentives_USD GHG_Emission_Reduction_tCO2e Air_Pollution_Reduction_Index Jobs_Created
Type_of_Renewable_Energy 3.997520e+00 -3.625395e-01 1.215235e+03 -3.220774e+02 1.917457e+01 -1.123573e-01 -0.014068 -4.578642e+06 -0.010630 -2.063998e+04 4.161362e+02 -4.239348e-01 4.744598e-01
Installed_Capacity_MW -3.625395e-01 8.306666e+04 -4.782348e+05 2.477787e+05 9.589998e+03 1.914227e+00 -3.490605 2.676324e+08 -0.499094 5.681402e+06 5.716652e+04 -6.180542e+01 -5.577267e+02
Energy_Production_MWh 1.215235e+03 -4.782348e+05 2.075412e+10 -6.972231e+06 -1.747641e+06 5.013188e+02 -417.277793 -1.254754e+11 -1267.525076 1.062634e+09 -8.600616e+06 -3.206321e+04 -4.846938e+05
Energy_Consumption_MWh -3.220774e+02 2.477787e+05 -6.972231e+06 1.669876e+10 -4.565008e+05 1.133853e+04 -397.011971 -3.983686e+11 -311.265800 7.976445e+09 1.898074e+07 1.103052e+04 1.865765e+06
Energy_Storage_Capacity_MWh 1.917457e+01 9.589998e+03 -1.747641e+06 -4.565008e+05 8.375794e+06 4.318747e+02 -24.887617 1.163868e+09 18.430238 2.370717e+08 -7.279311e+04 -5.027205e+02 4.037827e+03
Storage_Efficiency_Percentage -1.123573e-01 1.914227e+00 5.013188e+02 1.133853e+04 4.318747e+02 2.098236e+02 0.311152 7.897642e+06 -0.049555 2.316351e+04 1.381326e+03 3.198205e-01 2.009514e+02
Grid_Integration_Level -1.406792e-02 -3.490605e+00 -4.172778e+02 -3.970120e+02 -2.488762e+01 3.111520e-01 1.261816 -2.401866e+05 0.007995 -1.469872e+04 2.196670e+00 1.425433e-01 1.336476e+00
Initial_Investment_USD -4.578642e+06 2.676324e+08 -1.254754e+11 -3.983686e+11 1.163868e+09 7.897642e+06 -240186.628659 2.053038e+16 -473835.832677 1.840326e+12 -2.267804e+10 -7.335563e+06 -4.683964e+08
Funding_Sources -1.062978e-02 -4.990944e-01 -1.267525e+03 -3.112658e+02 1.843024e+01 -4.955450e-02 0.007995 -4.738358e+05 0.668295 -1.125168e+04 1.344563e+01 -5.655922e-02 1.443289e+01
Financial_Incentives_USD -2.063998e+04 5.681402e+06 1.062634e+09 7.976445e+09 2.370717e+08 2.316351e+04 -14698.721965 1.840326e+12 -11251.676358 3.349287e+13 3.112938e+08 -6.781600e+05 -1.404093e+08
GHG_Emission_Reduction_tCO2e 4.161362e+02 5.716652e+04 -8.600616e+06 1.898074e+07 -7.279311e+04 1.381326e+03 2.196670 -2.267804e+10 13.445634 3.112938e+08 2.067532e+08 -4.218332e+03 2.538464e+04
Air_Pollution_Reduction_Index -4.239348e-01 -6.180542e+01 -3.206321e+04 1.103052e+04 -5.027205e+02 3.198205e-01 0.142543 -7.335563e+06 -0.056559 -6.781600e+05 -4.218332e+03 8.154781e+02 1.882566e+02
Jobs_Created 4.744598e-01 -5.577267e+02 -4.846938e+05 1.865765e+06 4.037827e+03 2.009514e+02 1.336476 -4.683964e+08 14.432888 -1.404093e+08 2.538464e+04 1.882566e+02 2.106018e+06
In [14]:
df.std()
Out[14]:
Type_of_Renewable_Energy         1.999380e+00
Installed_Capacity_MW            2.882129e+02
Energy_Production_MWh            1.440629e+05
Energy_Consumption_MWh           1.292237e+05
Energy_Storage_Capacity_MWh      2.894096e+03
Storage_Efficiency_Percentage    1.448529e+01
Grid_Integration_Level           1.123306e+00
Initial_Investment_USD           1.432843e+08
Funding_Sources                  8.174933e-01
Financial_Incentives_USD         5.787303e+06
GHG_Emission_Reduction_tCO2e     1.437892e+04
Air_Pollution_Reduction_Index    2.855658e+01
Jobs_Created                     1.451213e+03
dtype: float64
In [15]:
sns.pairplot(df)
plt.title('Jobs_Created')
plt.show()
In [15]:
plt.scatter(df['Type_of_Renewable_Energy'], df['Jobs_Created'])
plt.show()
In [16]:
sns.boxplot(df["Type_of_Renewable_Energy"])
Out[16]:
<Axes: >
In [17]:
sns.boxplot(df["Installed_Capacity_MW"])
Out[17]:
<Axes: >
In [18]:
sns.boxplot(df["Installed_Capacity_MW"])
Out[18]:
<Axes: >
In [19]:
sns.boxplot(df["Energy_Production_MWh"])
Out[19]:
<Axes: >
In [20]:
sns.boxplot(df["Energy_Consumption_MWh"])
Out[20]:
<Axes: >
In [21]:
sns.boxplot(df["Energy_Storage_Capacity_MWh"])
Out[21]:
<Axes: >
In [22]:
sns.boxplot(df["Storage_Efficiency_Percentage"])
Out[22]:
<Axes: >
In [23]:
sns.boxplot(df["Grid_Integration_Level"])
Out[23]:
<Axes: >
In [24]:
sns.boxplot(df["Initial_Investment_USD"])
Out[24]:
<Axes: >
In [25]:
sns.boxplot(df["Funding_Sources"])
Out[25]:
<Axes: >
In [26]:
sns.boxplot(df["Financial_Incentives_USD"])
Out[26]:
<Axes: >

assimption 2 no multicoliniarity¶

In [27]:
df.shape
Out[27]:
(15000, 13)
In [28]:
df1 = df.iloc[:, :12]
df1
Out[28]:
Type_of_Renewable_Energy Installed_Capacity_MW Energy_Production_MWh Energy_Consumption_MWh Energy_Storage_Capacity_MWh Storage_Efficiency_Percentage Grid_Integration_Level Initial_Investment_USD Funding_Sources Financial_Incentives_USD GHG_Emission_Reduction_tCO2e Air_Pollution_Reduction_Index
0 4 93.423205 103853.2206 248708.4892 2953.248771 89.887562 4 4.732248e+08 1 9.207772e+06 6663.816572 81.742461
1 4 590.468942 190223.0649 166104.1642 5305.174042 84.403343 4 1.670697e+08 2 1.685101e+06 30656.049820 78.139042
2 1 625.951142 266023.4824 424114.6308 2620.192622 60.498249 2 8.463610e+07 2 5.111813e+06 1749.613759 8.461296
3 1 779.998728 487039.5296 308337.7316 1925.250307 86.897861 3 3.967690e+08 2 4.805902e+06 43233.237820 8.402441
4 3 242.106837 482815.0856 360437.7705 3948.945383 70.949351 2 3.574413e+07 1 1.668601e+07 14858.662760 28.822867
... ... ... ... ... ... ... ... ... ... ... ... ...
14995 3 745.032555 280007.5738 230544.8268 4351.687893 90.791405 4 3.484136e+08 2 1.558508e+07 25234.911810 78.923200
14996 1 15.187023 377340.5803 358547.3589 6792.194696 78.252040 4 2.560179e+08 3 6.866618e+06 15762.519790 54.982974
14997 3 877.539059 480497.3920 214441.6719 4588.725297 58.282928 1 1.300112e+08 2 3.837764e+06 44597.809410 43.915897
14998 7 551.264716 436383.1694 137043.8713 7251.144215 73.573666 2 3.334831e+08 2 5.347706e+06 34363.858000 4.877145
14999 3 863.421803 314014.3005 124461.9178 7366.166362 86.868893 4 2.989825e+08 2 1.832534e+07 27193.217600 50.559075

15000 rows × 12 columns

In [29]:
vif_df = pd.DataFrame()

vif_df["independant Features"] = df1.columns

vif_df
Out[29]:
independant Features
0 Type_of_Renewable_Energy
1 Installed_Capacity_MW
2 Energy_Production_MWh
3 Energy_Consumption_MWh
4 Energy_Storage_Capacity_MWh
5 Storage_Efficiency_Percentage
6 Grid_Integration_Level
7 Initial_Investment_USD
8 Funding_Sources
9 Financial_Incentives_USD
10 GHG_Emission_Reduction_tCO2e
11 Air_Pollution_Reduction_Index
In [30]:
vif_list = []


for i in range(df1.shape[1]):
    vif = variance_inflation_factor(df1.to_numpy(),i)
    vif_list.append(vif)
In [31]:
vif_df["VIF"] = vif_list
vif_df
Out[31]:
independant Features VIF
0 Type_of_Renewable_Energy 4.643886
1 Installed_Capacity_MW 3.789933
2 Energy_Production_MWh 3.866499
3 Energy_Consumption_MWh 3.881722
4 Energy_Storage_Capacity_MWh 3.860215
5 Storage_Efficiency_Percentage 17.040044
6 Grid_Integration_Level 5.548451
7 Initial_Investment_USD 3.880253
8 Funding_Sources 6.360197
9 Financial_Incentives_USD 3.833911
10 GHG_Emission_Reduction_tCO2e 3.909498
11 Air_Pollution_Reduction_Index 3.946135
In [32]:
df.shape
Out[32]:
(15000, 13)
In [33]:
x = df1         
y = df["Jobs_Created"] 

train test split¶

In [34]:
xtrain,xtest,ytrain,ytest = train_test_split(x, y, test_size=0.3, random_state=11)
In [35]:
xtrain.shape
Out[35]:
(10500, 12)
In [36]:
ytrain.shape
Out[36]:
(10500,)
In [37]:
xtest.shape
Out[37]:
(4500, 12)
In [38]:
ytest.shape
Out[38]:
(4500,)

Model Training¶

Algorithm 1 : Logistic Regression¶

In [39]:
log_reg = LogisticRegression()
log_reg
Out[39]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [40]:
log_reg_model = log_reg.fit(xtrain, ytrain)
log_reg_model
C:\Users\prajw\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Out[40]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [41]:
ytrain.head(10)
Out[41]:
14314    4050
2415     2670
8169     1737
12199    1631
2737     3030
12032    3382
12452     148
861      4120
4661     2168
8245     4360
Name: Jobs_Created, dtype: int64
In [42]:
ytrain_pred = log_reg_model.predict(xtrain)
ytrain_pred
Out[42]:
array([1045, 4888, 1045, ..., 1045, 4054, 3470], dtype=int64)

Model Evaluation for trining¶

In [43]:
ytrain_pred = log_reg_model.predict(xtrain)
In [44]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 3796829.2994285715
mean absolute error : 1588.490476190476
Root mean squared error : 1948.5454317076037
R2 score: -0.7948788777442457

Model Evaluation Fore Testing¶

In [45]:
ytest_pred = log_reg_model.predict(xtest)
In [46]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 3812043.5486666667
mean absolute error : 1594.8197777777777
Root mean squared error : 1952.4455302688132
R2 score: -0.830701998967226

Algorithm 2 : Decision Tree¶

In [47]:
dt_reg = DecisionTreeRegressor()
dt_reg
Out[47]:
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
In [48]:
dt_reg_model = dt_reg.fit(xtrain,ytrain)
dt_reg_model
Out[48]:
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()

model Evaluation for training¶

In [49]:
ytrain_pred = dt_reg_model.predict(xtrain)
In [50]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 0.0
mean absolute error : 0.0
Root mean squared error : 0.0
R2 score: 1.0

model Evaluation for Testing¶

In [51]:
ytest_pred = dt_reg_model.predict(xtest)
In [52]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 4294379.726666667
mean absolute error : 1696.8528888888889
Root mean squared error : 2072.288523991451
R2 score: -1.0623399102255227

Decision Tree With Hyperparameter Tunning¶

In [53]:
hyperparameters = {
    "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_depth" : np.arange(2,50),
    "min_samples_split": np.arange(2,10),
    "min_samples_leaf" : np.arange(2,15)
}
print(hyperparameters)
{'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'], 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]), 'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]), 'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])}
In [54]:
rscv = RandomizedSearchCV(dt_reg_model, hyperparameters, cv=5) #cv : cross validation
rscv
Out[54]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
DecisionTreeRegressor()
DecisionTreeRegressor()
In [56]:
rscv_reg = rscv.fit(xtrain, ytrain)
rscv_reg
Out[56]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
DecisionTreeRegressor()
DecisionTreeRegressor()
In [57]:
rscv_reg.best_estimator_
Out[57]:
DecisionTreeRegressor(max_depth=3, min_samples_leaf=7, min_samples_split=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(max_depth=3, min_samples_leaf=7, min_samples_split=7)
In [58]:
rscv_reg_hyp = DecisionTreeRegressor(criterion='poisson', max_depth=10, min_samples_leaf=3,
                      min_samples_split=7)
rscv_reg_hyp
Out[58]:
DecisionTreeRegressor(criterion='poisson', max_depth=10, min_samples_leaf=3,
                      min_samples_split=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='poisson', max_depth=10, min_samples_leaf=3,
                      min_samples_split=7)
In [59]:
dt_reg_hyp_model = rscv_reg_hyp.fit(xtrain,ytrain)
dt_reg_hyp_model
Out[59]:
DecisionTreeRegressor(criterion='poisson', max_depth=10, min_samples_leaf=3,
                      min_samples_split=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='poisson', max_depth=10, min_samples_leaf=3,
                      min_samples_split=7)

model Evaluation for training¶

In [60]:
ytrain_pred = dt_reg_hyp_model.predict(xtrain)
In [61]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 1922712.719504361
mean absolute error : 1177.1810715617278
Root mean squared error : 1386.619168879603
R2 score: 0.09107411051427561

model Evaluation for Testing¶

In [62]:
ytest_pred = dt_reg_hyp_model.predict(xtest)
In [63]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2294414.3971356065
mean absolute error : 1297.390343694187
Root mean squared error : 1514.7324506775467
R2 score: -0.10187330487462609
In [64]:
plot_tree(dt_reg_hyp_model, feature_names=df.columns,class_names= list("ABCDEF"), filled=True)
plt.savefig("dt_hyp_model.png")

Algorithm 3 : Random Forest¶

In [65]:
rf_reg = RandomForestRegressor()
rf_reg
Out[65]:
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()
In [66]:
rf_reg_model = rf_reg.fit(xtrain,ytrain)
rf_reg_model
Out[66]:
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestRegressor()

model Evaluation for training¶

In [67]:
ytrain_pred = rf_reg_model.predict(xtrain)
In [68]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 300817.0957437714
mean absolute error : 468.8258590476191
Root mean squared error : 548.4679532513923
R2 score: 0.8577944361901852

model Evaluation for Testing¶

In [69]:
ytest_pred = rf_reg_model.predict(xtest)
In [70]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2141911.119747
mean absolute error : 1265.6518688888889
Root mean squared error : 1463.526945343679
R2 score: -0.028634882700244946

Random Forest With Hyperparameter Tunning¶

In [71]:
hyperparameters = {
    "n_estimators" : np.arange(2,50),
    "criterion" : ["squared_error", "absolute_error", "friedman_mse", "poisson"],
    "max_depth" : np.arange(2,50),
    "min_samples_split": np.arange(2,10),
    "min_samples_leaf" : np.arange(2,15)
}
print(hyperparameters)
{'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]), 'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'], 'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]), 'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]), 'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])}
In [72]:
rscv_rf = RandomizedSearchCV(rf_reg_model, hyperparameters, cv=5)
rscv_rf
Out[72]:
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9]),
                                        'n_estimators': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])})
RandomForestRegressor()
RandomForestRegressor()
In [73]:
rscv_rf_reg = rscv_reg.fit(xtrain,ytrain)
rscv_rf_reg
Out[73]:
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(),
                   param_distributions={'criterion': ['squared_error',
                                                      'absolute_error',
                                                      'friedman_mse',
                                                      'poisson'],
                                        'max_depth': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                                        'min_samples_leaf': array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14]),
                                        'min_samples_split': array([2, 3, 4, 5, 6, 7, 8, 9])})
DecisionTreeRegressor()
DecisionTreeRegressor()
In [74]:
rscv_rf_reg.best_estimator_
Out[74]:
DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, min_samples_leaf=3,
                      min_samples_split=8)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, min_samples_leaf=3,
                      min_samples_split=8)
In [75]:
rscv_rf_reg = DecisionTreeRegressor(criterion='poisson', max_depth=3, min_samples_leaf=8,
                      min_samples_split=8)
rscv_rf_reg
Out[75]:
DecisionTreeRegressor(criterion='poisson', max_depth=3, min_samples_leaf=8,
                      min_samples_split=8)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor(criterion='poisson', max_depth=3, min_samples_leaf=8,
                      min_samples_split=8)
In [76]:
rscv_rf_reg_model = rscv_rf_reg.fit(xtrain,ytrain)

model Evaluation for training¶

In [77]:
ytrain_pred = rscv_rf_reg_model.predict(xtrain)
In [78]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 2103156.847674949
mean absolute error : 1255.1765315363575
Root mean squared error : 1450.2264815107153
R2 score: 0.005772578966594999

model Evaluation for Testing¶

In [79]:
ytest_pred = rscv_rf_reg_model.predict(xtest)
In [80]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2090347.9410888588
mean absolute error : 1251.2156414210908
Root mean squared error : 1445.8035624139466
R2 score: -0.0038720978481199264

Algorithm 4: AdaBoost¶

In [81]:
adb_reg = AdaBoostRegressor()
adb_reg_model = adb_reg.fit(xtrain,ytrain)
adb_reg_model
Out[81]:
AdaBoostRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AdaBoostRegressor()

model Evaluation for training¶

In [82]:
ytrain_pred = adb_reg_model.predict(xtrain)
In [83]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 2111358.455553671
mean absolute error : 1259.2899533595385
Root mean squared error : 1453.051429080771
R2 score: 0.0018954247454995299

model Evaluation for Testing¶

In [84]:
ytest_pred = adb_reg_model.predict(xtest)
In [85]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2083499.1601479966
mean absolute error : 1250.8864015602019
Root mean squared error : 1443.4331159246683
R2 score: -0.0005830281407945836

Adaboost with hyperprameter tunning¶

In [86]:
hyp = {"n_estimators" : np.arange(2,100),
       "learning_rate" : [0, 0.1, 0.001, 0.0001, 1]}
In [87]:
RandomizedSearchCV(adb_reg_model, hyp, cv=5).fit(xtrain,ytrain).best_estimator_
C:\Users\prajw\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:378: FitFailedWarning: 
10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\prajw\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\prajw\anaconda3\lib\site-packages\sklearn\ensemble\_weight_boosting.py", line 124, in fit
    self._validate_params()
  File "C:\Users\prajw\anaconda3\lib\site-packages\sklearn\base.py", line 581, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\prajw\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'learning_rate' parameter of AdaBoostRegressor must be a float in the range (0, inf). Got 0 instead.

  warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\Users\prajw\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:952: UserWarning: One or more of the test scores are non-finite: [-0.0017325  -0.00123039 -0.0015858  -0.00018256         nan -0.000965
         nan -0.00167245 -0.00123635 -0.0010247 ]
  warnings.warn(
Out[87]:
AdaBoostRegressor(learning_rate=0.1, n_estimators=8)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AdaBoostRegressor(learning_rate=0.1, n_estimators=8)
In [88]:
adb_reg_hyp = AdaBoostRegressor(learning_rate=0.1, n_estimators=76)
adb_reg_hyp_model = adb_reg_hyp.fit(xtrain, ytrain)
adb_reg_hyp_model
Out[88]:
AdaBoostRegressor(learning_rate=0.1, n_estimators=76)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
AdaBoostRegressor(learning_rate=0.1, n_estimators=76)

model Evaluation for training¶

In [89]:
ytrain_pred = adb_reg_hyp_model.predict(xtrain)
In [90]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 2109379.248700918
mean absolute error : 1258.6591436938827
Root mean squared error : 1452.3702175068581
R2 score: 0.002831057162586914

model Evaluation for Testing¶

In [91]:
ytest_pred = adb_reg_hyp_model.predict(xtest)
In [92]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2084796.4621296413
mean absolute error : 1251.2237551578348
Root mean squared error : 1443.882426698809
R2 score: -0.001206046556177398

Algorithm 5 : GradientBoost¶

In [93]:
gdb_reg_model = GradientBoostingRegressor().fit(xtrain, ytrain)
gdb_reg_model
Out[93]:
GradientBoostingRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor()

model Evaluation for training¶

In [94]:
ytrain_pred = gdb_reg_model.predict(xtrain)
In [95]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 2001201.1718399662
mean absolute error : 1223.691900725033
Root mean squared error : 1414.6381770049775
R2 score: 0.05397018665201092

model Evaluation for Testing¶

In [96]:
ytest_pred = gdb_reg_model.predict(xtest)
In [97]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2095747.1312670347
mean absolute error : 1252.2144979335435
Root mean squared error : 1447.6695518201088
R2 score: -0.006465013727963154

GradientBosst with Hyperparameter Tunning¶

In [98]:
hyp = {"n_estimators" : np.arange(2,100),
       "learning_rate" : [0, 0.1, 0.001, 0.0001, 1]}
In [99]:
RandomizedSearchCV(gdb_reg_model, hyp, cv=5).fit(xtrain, ytrain).best_estimator_
Out[99]:
GradientBoostingRegressor(learning_rate=0.001, n_estimators=37)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(learning_rate=0.001, n_estimators=37)
In [100]:
gdb_reg_hyp_model = GradientBoostingRegressor(learning_rate=0.001, n_estimators=42).fit(xtrain, ytrain)
gdb_reg_hyp_model
Out[100]:
GradientBoostingRegressor(learning_rate=0.001, n_estimators=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingRegressor(learning_rate=0.001, n_estimators=42)

model Evaluation for training¶

In [101]:
ytrain_pred = gdb_reg_hyp_model.predict(xtrain)
In [102]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
mean squared error : 2114316.2139810575
mean absolute error : 1260.159652620357
Root mean squared error : 1454.0688477445135
R2 score: 0.0004972006726960965

model Evaluation for Testing¶

In [103]:
ytest_pred = gdb_reg_hyp_model.predict(xtest)
In [104]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")
mean squared error : 2084210.8770516566
mean absolute error : 1251.0061098955455
Root mean squared error : 1443.6796310302561
R2 score: -0.0009248242251242988

Algorithm 6 : Support Vector Machine (SVC)¶

In [105]:
svc_model = SVC().fit(xtrain,ytrain)
svc_model
Out[105]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()

model Evaluation for training¶

In [ ]:
ytrain_pred = svc_model.predict(xtrain)
In [ ]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")

model Evaluation for Testing¶

In [ ]:
ytest_pred = svc_model.predict(xtest)
In [ ]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_pred)
print(f"R2 score: {r2score}")

SVC by hyperparameter tunning¶

In [ ]:
hyp = {"kernel" : ["linear", "rbf"], "C" : np.arange(2,15)}
hyp
In [ ]:
svc_hyp = SVC(kernel = "linear", C =7)
svc_hyp_model = svc_hyp.fit(xtrain,ytrain)
svc_hyp_model

model Evaluation for training¶

In [ ]:
ytrain_pred = svc_hyp_model.predict(xtrain)
In [ ]:
mse = mean_squared_error(ytrain,ytrain_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytrain,ytrain_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytrain,ytrain_pred)
print(f"R2 score: {r2score}")
In [ ]:
## model Evaluation for Testing
In [ ]:
ytest_pred = svc_hyp_model.predict(xtest)
In [ ]:
mse = mean_squared_error(ytest,ytest_pred)
print(f"mean squared error : {mse}")

mae = mean_absolute_error(ytest,ytest_pred)
print(f"mean absolute error : {mae}")

rmse = np.sqrt(mse)
print(f"Root mean squared error : {rmse}")

r2score =r2_score(ytest,ytest_p